list.of.packages <- c("reshape2","dplyr", "tidyr", "readr", "stringr", "plotly", "car") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
load(here::here("analyses", "meth_filter_reshaped"))
head(meth_filter_reshaped) #check to see if it's
## chr start end strand sample coverage numCs numTs percMeth population
## 1 Contig0 39226 39226 + 1 21 11 10 52.38095 HC
## 2 Contig0 39234 39234 + 1 24 10 14 41.66667 HC
## 3 Contig0 71523 71523 + 1 13 13 0 100.00000 HC
## 4 Contig0 71533 71533 + 1 17 17 0 100.00000 HC
## 5 Contig0 71542 71542 + 1 16 15 1 93.75000 HC
## 6 Contig0 71546 71546 + 1 14 14 0 100.00000 HC
slop to inclue 2kb regions on either side of gene regionsI want to identify methylated loci that are within genes but also 2kb upstream and downstream of gene regions. Therefore, before intersecting methylated loci with genes, I need to create a gene region +/- 2kb file. I will do this using slopBed.
First, generate a “genome file”, which defines size of each chromosome. This is so the slop function restricts results to within a contig. I can use the indexed FASTA file that I created for a blast.
Extract column 1 (contig name) and column 2 (# bases in the contig)
head -n 2 "../resources/Olurida_v081.fa.fai"
cut -f 1,2 "../resources/Olurida_v081.fa.fai" > "../resources/Olurida_chrom.sizes"
head -n 2 "../resources/Olurida_chrom.sizes"
## Contig0 116746 9 116746 116747
## Contig1 87411 116765 87411 87412
## Contig0 116746
## Contig1 87411
head -n 4 "../genome-features/Olurida_v081-20190709.gene.gff"
## ##gff-version 3
## Contig61093 maker gene 7493 7946 . + . ID=OLUR_00020575;Name=OLUR_00020575;Alias=maker-Contig61093-snap-gene-0.2;Note=Protein of unknown function;
## Contig1111 maker gene 24968 28696 . - . ID=OLUR_00006628;Name=OLUR_00006628;Alias=maker-Contig1111-snap-gene-0.1;Note=Similar to Spag6: Sperm-associated antigen 6 (Mus musculus OX%3D10090);Dbxref=Gene3D:G3DSA:1.25.10.10,InterPro:IPR000225,InterPro:IPR000357,InterPro:IPR011989,InterPro:IPR016024,Pfam:PF02985,SMART:SM00185,SUPERFAMILY:SSF48371;Ontology_term=GO:0005515;
## Contig214118 maker gene 201 926 . + . ID=OLUR_00032161;Name=OLUR_00032161;Alias=maker-Contig214118-snap-gene-0.0;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:3.10.450.10;
slopBed \
-i "../genome-features/Olurida_v081-20190709.gene.gff" \
-g "../resources/Olurida_chrom.sizes" \
-b 2000 \
> "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff"
head -n 3 "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff"
## Contig61093 maker gene 5493 9946 . + . ID=OLUR_00020575;Name=OLUR_00020575;Alias=maker-Contig61093-snap-gene-0.2;Note=Protein of unknown function;
## Contig1111 maker gene 22968 28792 . - . ID=OLUR_00006628;Name=OLUR_00006628;Alias=maker-Contig1111-snap-gene-0.1;Note=Similar to Spag6: Sperm-associated antigen 6 (Mus musculus OX%3D10090);Dbxref=Gene3D:G3DSA:1.25.10.10,InterPro:IPR000225,InterPro:IPR000357,InterPro:IPR011989,InterPro:IPR016024,Pfam:PF02985,SMART:SM00185,SUPERFAMILY:SSF48371;Ontology_term=GO:0005515;
## Contig214118 maker gene 1 1068 . + . ID=OLUR_00032161;Name=OLUR_00032161;Alias=maker-Contig214118-snap-gene-0.0;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:3.10.450.10;
wb: Print all lines in the second file a: input data file, which was created in previous code chunk b: save annotated gene list
intersectBed \
-wb \
-a "../analyses/meth_filter.tab" \
-b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" \
> "../analyses/meth_filter_gene.2kbslop.tab"
wc -l "../analyses/meth_filter_gene.2kbslop.tab"
## 217908 ../analyses/meth_filter_gene.2kbslop.tab
head -n 2 "../analyses/meth_filter_gene.2kbslop.tab"
## Contig0 39226 39226 + 1 21 11 10 52.38095238095239 HC Contig0 maker gene 10497 95068 . + . ID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;
## Contig0 39234 39234 + 1 24 10 14 41.66666666666667 HC Contig0 maker gene 10497 95068 . + . ID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;
Return the number of loci associated with gene regions. Not sure if this will be used, but it’s good to save it just in case.
intersectBed \
-wb \
-a "../analyses/meth_filter.tab" \
-b "../genome-features/Olurida_v081-20190709.gene.gff" \
> "../analyses/meth_filter_gene.tab"
wc -l "../analyses/meth_filter_gene.tab"
## 192834 ../analyses/meth_filter_gene.tab
– gene = contig number + start/end locus – group = sample number + gene – population = HC for Hood Canal, or SS for South Sound
head -n 3 "../analyses/meth_filter_gene.2kbslop.tab"
## Contig0 39226 39226 + 1 21 11 10 52.38095238095239 HC Contig0 maker gene 10497 95068 . + . ID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;
## Contig0 39234 39234 + 1 24 10 14 41.66666666666667 HC Contig0 maker gene 10497 95068 . + . ID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;
## Contig0 71523 71523 + 1 13 13 0 100 HC Contig0 maker gene 10497 95068 . + . ID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;
head -n 1 "../analyses/meth_filter_gene.tab"
## Contig0 39226 39226 + 1 21 11 10 52.38095238095239 HC Contig0 maker gene 12497 93068 . + . ID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;
meth_filter_genes_2kbslop <-
read_delim(file = here::here("analyses", "meth_filter_gene.2kbslop.tab"), delim = "\t", col_names = c(colnames(meth_filter_reshaped[-10]), "population", "contig_gene", "source_gene", "feature_gene", "start_gene", "end_gene", "unknown1_gene", "strand_gene", "unknown2_gene", "notes_gene")) %>%
mutate(gene=paste(contig_gene, start_gene, end_gene, sep="_")) %>%
mutate(group=paste(sample, gene, sep="_")) #%>%
## Parsed with column specification:
## cols(
## chr = col_character(),
## start = col_double(),
## end = col_double(),
## strand = col_character(),
## sample = col_double(),
## coverage = col_double(),
## numCs = col_double(),
## numTs = col_double(),
## percMeth = col_double(),
## population = col_character(),
## contig_gene = col_character(),
## source_gene = col_character(),
## feature_gene = col_character(),
## start_gene = col_double(),
## end_gene = col_double(),
## unknown1_gene = col_character(),
## strand_gene = col_character(),
## unknown2_gene = col_character(),
## notes_gene = col_character()
## )
# mutate(population=as.character(sample))
#meth_filter_genes_2kbslop$population <- car::recode(meth_filter_genes_2kbslop$population, "c('1', '2', '3', '4', '5', '6', '7', '8', '9')='HC'")
#meth_filter_genes_2kbslop$population <- car::recode(meth_filter_genes_2kbslop$population, "c('10','11','12','13','14','15','16','17','18')='SS'")
length(unique(meth_filter_genes_2kbslop$gene))
## [1] 2838
min.filt <- dplyr::count(meth_filter_genes_2kbslop, vars = c(group))
newdata <- min.filt[which(min.filt$n > 4), ]
sub_meth_table <- meth_filter_genes_2kbslop[meth_filter_genes_2kbslop$group %in% newdata$vars,]
length(unique(sub_meth_table$gene))
## [1] 793
Note: this script was created by Hollie Putnam (GM.Rmd); there are minor revisions below. I retained some commented out lines (notably-testing for position w/n gene, such as intron & exon) in case we want to include those as factors in the future.
# create data frame to stored results
results <- data.frame()
gs <- unique(sub_meth_table$gene)
#first subset the unique dataframes and second run the GLMs
for(i in 1:length(gs)){
#subset the dataframe gene by gene
sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
# fit glm position model
fit <- glm(matrix(c(numCs, numTs), ncol=2) ~ as.factor(population) + (1|sample),
data=sub_meth_table1, family=binomial)
a <- anova(fit, test="Chisq")
# capture summary stats to data frame
df <- data.frame(sub_meth_table1[c("population", "sample", "contig_gene", "start_gene", "end_gene", "gene", "chr", "start", "sample", "coverage", "numCs", "numTs", "percMeth", "notes_gene")],
pval.treatment = a$`Pr(>Chi)`[2],
#pval.position = a$`Pr(>Chi)`[3], #uncomment if you want to include position of CpG within a gene
#pval.treatment_x_position = a$`Pr(>Chi)`[4], #uncomment if you want to include position of CpG within a gene interaction with treatment
stringsAsFactors = F)
# bind rows of temporary data frame to the results data frame
results <- rbind(results, df)
}
results[is.na(results)] <- 0
results$adj.pval.pop <- p.adjust(results$pval.treatment, method='BH')
#results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #uncomment if you want to include position of CpG within a gene
#results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') #uncomment if you want to include position of CpG within a gene interaction with treatment
DMGs <- subset(results, adj.pval.pop < 0.05) #safe df with differentially methylated genes
DMGs.genes <- DMGs[!duplicated(DMGs$gene), c("contig_gene", "start_gene", "end_gene", "notes_gene", "pval.treatment", "adj.pval.pop")]
length(unique(DMGs$gene))
## [1] 155
# split gene data in "notes_gene" column into separate columns
DMGs.genes <- DMGs.genes %>%
mutate(ID=str_extract(notes_gene, "ID=(.*?);"),
Parent=str_extract(notes_gene, "Parent=(.*?);"),
Name=str_extract(notes_gene, "Name=(.*?);"),
Alias=str_extract(notes_gene, "Alias=(.*?);"),
AED=str_extract(notes_gene, "AED=(.*?);"),
eAED=str_extract(notes_gene, "eAED=(.*?);"),
Note=str_extract(notes_gene, "Note=(.*?);"),
Ontology_term=str_extract(notes_gene, "Ontology_term=(.*?);"),
Dbxref=str_extract(notes_gene, "Dbxref=(.*?);")
)
write_csv(DMGs.genes, path = here::here("analyses", "DMGs.genes.csv"))
#Extract GO terms
DMGs.genes.GO <- DMGs.genes %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:11, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3","GO_4","GO_5","GO_6","GO_7","GO_8","GO_9","GO_10","GO_11"), names_to = "GO_number", values_to = "GO_term") %>%
dplyr::select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
## Warning: Expected 11 pieces. Missing pieces filled with `NA` in 66 rows [1, 2,
## 4, 5, 6, 9, 10, 11, 14, 17, 18, 19, 20, 21, 25, 31, 32, 33, 34, 39, ...].
write_delim(DMGs.genes.GO[,c("GO_term","adj.pval.pop")], path = here::here("analyses/", "DMGs.GO.txt"), delim = '\t', col_names = F) #write out df with just GO terms and p-adj values
| term_ID | description | frequency | plot_X | plot_Y | plot_size | log10 p-value | uniqueness | dispensability | representative | eliminated |
|---|---|---|---|---|---|---|---|---|---|---|
| GO:0006030 | chitin metabolic process | 0.08% | 0.696 | -7.104 | 3.994 | -4.6619 | 0.855 | 0 | 6030 | 0 |
| GO:0006886 | intracellular protein transport | 1.20% | -4.704 | 2.551 | 5.187 | -3.1954 | 0.746 | 0 | 6886 | 0 |
| GO:0030154 | cell differentiation | 1.13% | -2.491 | 7.082 | 5.162 | -2.9446 | 0.833 | 0 | 30154 | 0 |
| GO:0006457 | protein folding | 0.90% | -1.784 | -5.793 | 5.064 | -2.1316 | 0.877 | 0.035 | 6457 | 0 |
| GO:0007154 | cell communication | 7.22% | -3.262 | -6.728 | 5.967 | -1.5876 | 0.869 | 0.046 | 7154 | 0 |
| GO:0007264 | small GTPase mediated signal transduction | 0.49% | 4.131 | 4.923 | 4.794 | -2.4209 | 0.662 | 0.083 | 7264 | 0 |
| GO:0006468 | protein phosphorylation | 4.14% | 5.057 | -3.162 | 5.725 | -4.4125 | 0.687 | 0.109 | 6468 | 0 |
| GO:0006265 | DNA topological change | 0.27% | 2.434 | 0.023 | 4.536 | -2.7509 | 0.675 | 0.155 | 6265 | 0 |
| GO:0007018 | microtubule-based movement | 0.29% | -4.924 | -3.858 | 4.567 | -2.0562 | 0.842 | 0.181 | 7018 | 0 |
| GO:0070588 | calcium ion transmembrane transport | 0.16% | -5.477 | 0.479 | 4.305 | -2.2996 | 0.782 | 0.263 | 70588 | 0 |
| GO:0009408 | response to heat | 0.17% | 3.756 | 6.187 | 4.328 | -2.1316 | 0.754 | 0.315 | 9408 | 0 |
| GO:0006355 | regulation of transcription, DNA-templated | 9.92% | 4.396 | 0.755 | 6.105 | -2.195 | 0.621 | 0.355 | 6355 | 0 |
| GO:0006511 | ubiquitin-dependent protein catabolic process | 0.58% | 4.409 | -4.118 | 4.874 | -2.028 | 0.737 | 0.4 | 6511 | 0 |
| GO:0055085 | transmembrane transport | 8.92% | -5.486 | 1.791 | 6.058 | -2.2996 | 0.781 | 0.418 | 55085 | 0 |
| GO:0016579 | protein deubiquitination | 0.20% | 4.663 | -4.631 | 4.398 | -2.028 | 0.699 | 0.501 | 16579 | 0 |
| GO:0016567 | protein ubiquitination | 0.52% | null | null | 4.827 | -1.4114 | 0.697 | 0.829 | 16579 | 1 |
| GO:0007186 | G-protein coupled receptor signaling pathway | 0.88% | 3.658 | 4.986 | 5.054 | -1.6563 | 0.648 | 0.504 | 7186 | 0 |
| GO:0006811 | ion transport | 5.34% | -5.591 | 2.311 | 5.836 | -2.2996 | 0.785 | 0.535 | 6811 | 0 |
| GO:0006470 | protein dephosphorylation | 0.59% | 5.422 | -3.791 | 4.875 | -1.7904 | 0.722 | 0.568 | 6470 | 0 |
| GO:0006281 | DNA repair | 2.23% | 4.71 | 2.153 | 5.457 | -1.3059 | 0.524 | 0.577 | 6281 | 0 |
| GO:0007016 | cytoskeletal anchoring at plasma membrane | 0.01% | -2.131 | 2.783 | 2.867 | -2.9056 | 0.674 | 0.579 | 7016 | 0 |
| GO:0016560 | protein import into peroxisome matrix, docking | 0.02% | -3.674 | 1.871 | 3.287 | -2.2544 | 0.697 | 0.621 | 16560 | 0 |
| GO:0051103 | DNA ligation involved in DNA repair | 0.04% | 4.836 | 3.18 | 3.703 | -1.3059 | 0.631 | 0.628 | 51103 | 0 |
| GO:0035556 | intracellular signal transduction | 4.00% | 3.322 | 4.935 | 5.71 | -1.4274 | 0.607 | 0.638 | 35556 | 0 |
| GO:0007165 | signal transduction | 6.62% | null | null | 5.929 | -1.4769 | 0.594 | 0.847 | 35556 | 1 |
| GO:0006814 | sodium ion transport | 0.31% | -5.839 | 0.651 | 4.592 | -2.0455 | 0.784 | 0.656 | 6814 | 0 |
| GO:0006310 | DNA recombination | 1.64% | 5.722 | -0.412 | 5.323 | -1.3059 | 0.697 | 0.688 | 6310 | 0 |
# A plotting R script produced by the REVIGO server at http://revigo.irb.hr/
# If you found REVIGO useful in your work, please cite the following reference:
# Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology
# terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800
# --------------------------------------------------------------------------
# If you don't have the ggplot2 package installed, uncomment the following line:
# install.packages( "ggplot2" );
library( ggplot2 );
# --------------------------------------------------------------------------
# If you don't have the scales package installed, uncomment the following line:
# install.packages( "scales" );
library( scales );
# --------------------------------------------------------------------------
# Here is your data from REVIGO. Scroll down for plot configuration options.
revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","plot_size","log10_p_value","uniqueness","dispensability");
revigo.data <- rbind(c("GO:0006030","chitin metabolic process", 0.077,-0.750,-7.043, 3.994,-4.6619,0.855,0.000),
c("GO:0006886","intracellular protein transport", 1.199, 4.707, 2.562, 5.187,-3.1954,0.746,0.000),
c("GO:0030154","cell differentiation", 1.133, 2.462, 7.088, 5.162,-2.9446,0.833,0.000),
c("GO:0006457","protein folding", 0.903, 1.598,-6.661, 5.064,-2.1316,0.877,0.035),
c("GO:0007154","cell communication", 7.219, 3.773,-6.108, 5.967,-1.5876,0.869,0.046),
c("GO:0007264","small GTPase mediated signal transduction", 0.485,-4.128, 4.889, 4.794,-2.4209,0.662,0.083),
c("GO:0006468","protein phosphorylation", 4.137,-5.013,-3.169, 5.725,-4.4125,0.687,0.109),
c("GO:0006265","DNA topological change", 0.268,-2.351,-0.007, 4.536,-2.7509,0.675,0.155),
c("GO:0007018","microtubule-based movement", 0.287, 4.753,-3.755, 4.567,-2.0562,0.842,0.181),
c("GO:0070588","calcium ion transmembrane transport", 0.157, 5.497, 0.499, 4.305,-2.2996,0.782,0.263),
c("GO:0009408","response to heat", 0.166,-3.802, 6.157, 4.328,-2.1316,0.754,0.315),
c("GO:0006355","regulation of transcription, DNA-templated", 9.917,-4.355, 0.735, 6.105,-2.1950,0.621,0.355),
c("GO:0006511","ubiquitin-dependent protein catabolic process", 0.584,-4.345,-4.128, 4.874,-2.0280,0.737,0.400),
c("GO:0055085","transmembrane transport", 8.916, 5.501, 1.819, 6.058,-2.2996,0.781,0.418),
c("GO:0016579","protein deubiquitination", 0.195,-4.691,-4.605, 4.398,-2.0280,0.699,0.501),
c("GO:0007186","G-protein coupled receptor signaling pathway", 0.882,-3.647, 4.983, 5.054,-1.6563,0.648,0.504),
c("GO:0006811","ion transport", 5.344, 5.596, 2.340, 5.836,-2.2996,0.785,0.535),
c("GO:0006470","protein dephosphorylation", 0.585,-5.408,-3.778, 4.875,-1.7904,0.722,0.568),
c("GO:0006281","DNA repair", 2.234,-4.690, 2.130, 5.457,-1.3059,0.524,0.577),
c("GO:0007016","cytoskeletal anchoring at plasma membrane", 0.006, 2.140, 2.772, 2.867,-2.9056,0.674,0.579),
c("GO:0016560","protein import into peroxisome matrix, docking", 0.015, 3.678, 1.849, 3.287,-2.2544,0.697,0.621),
c("GO:0051103","DNA ligation involved in DNA repair", 0.039,-4.819, 3.156, 3.703,-1.3059,0.631,0.628),
c("GO:0035556","intracellular signal transduction", 4.000,-3.323, 4.909, 5.710,-1.4274,0.607,0.638),
c("GO:0006814","sodium ion transport", 0.305, 5.860, 0.675, 4.592,-2.0455,0.784,0.656),
c("GO:0006310","DNA recombination", 1.641,-5.673,-0.433, 5.323,-1.3059,0.697,0.688));
one.data <- data.frame(revigo.data);
names(one.data) <- revigo.names;
one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ];
one.data$plot_X <- as.numeric( as.character(one.data$plot_X) );
one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) );
one.data$plot_size <- as.numeric( as.character(one.data$plot_size) );
one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) );
one.data$frequency <- as.numeric( as.character(one.data$frequency) );
one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) );
one.data$dispensability <- as.numeric( as.character(one.data$dispensability) );
#head(one.data);
# --------------------------------------------------------------------------
# Names of the axes, sizes of the numbers and letters, names of the columns,
# etc. can be changed below
p1 <- ggplot( data = one.data );
p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = plot_size), alpha = I(0.6) ) + scale_size_area();
p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) );
p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = plot_size), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area();
p1 <- p1 + scale_size( range=c(5, 30)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) );
#ex <- one.data [ one.data$dispensability < 0.15, ];
p1 <- p1 + geom_text( data = one.data, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 ); #changed data from "ex" to "one.data"
p1 <- p1 + labs (y = "semantic space x", x = "semantic space y");
p1 <- p1 + theme(legend.key = element_blank()) ;
one.x_range = max(one.data$plot_X) - min(one.data$plot_X);
one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y);
p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10);
p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10);
# --------------------------------------------------------------------------
# Output the plot to screen
p1;
# Uncomment the line below to also save the plot to a file.
# The file type depends on the extension (default=pdf).
# ggsave("C:/Users/path_to_your_file/revigo-plot.pdf");
Create bed files to visualze as a track of DMGs in IGV
DMGs.bed <- meth_filter_genes_2kbslop %>%
filter(contig_gene %in% DMGs.genes$contig_gene &
start_gene %in% DMGs.genes$start_gene &
end_gene %in% DMGs.genes$end_gene) %>%
dplyr::select(contig_gene, start_gene, end_gene,
unknown1_gene, strand_gene, unknown2_gene, notes_gene) %>%
distinct(contig_gene, start_gene, end_gene)
#DMGs.bed <- DMGs.bed[!duplicated(test$contig_gene),]
readr::write_delim(DMGs.bed, "../analyses/DMGs.bed", delim = '\t', col_names = FALSE)
ggplotly(meth_filter_genes_2kbslop %>%
filter(contig_gene %in% DMGs.genes$contig_gene &
start_gene %in% DMGs.genes$start_gene &
end_gene %in% DMGs.genes$end_gene) %>%
group_by(population, gene) %>%
summarise(allCs_percent = 100*(sum(numCs)/sum(coverage)),
mean_percentMeth = mean(percMeth)) %>%
ggplot(aes(x = population, y = mean_percentMeth, fill = population)) + geom_bar(stat="identity") + facet_wrap(~gene) + theme_light() + scale_fill_manual(values=c("firebrick3","dodgerblue3")))
## Warning: Removed 271 rows containing missing values (position_stack).
#checking to make sure numCs + numTs = coverage; should be 1:1 line
plot(meth_filter_genes_2kbslop$numCs + meth_filter_genes_2kbslop$numTs ~ meth_filter_genes_2kbslop$coverage)
## Look at coverage for each DMG by population (mean % methylation across samples)
ggplotly(meth_filter_genes_2kbslop %>%
filter(contig_gene %in% DMGs.genes$contig_gene &
start_gene %in% DMGs.genes$start_gene &
end_gene %in% DMGs.genes$end_gene) %>%
group_by(population, gene) %>%
summarise(sum_cov = sum(coverage),
mean_cov = mean(coverage)) %>%
ggplot(aes(x = population, y = mean_cov, fill = population)) +
geom_bar(stat="identity") +
facet_wrap(~gene) + theme_light() + scale_fill_manual(values=c("firebrick3","dodgerblue3")))
## Warning: Removed 271 rows containing missing values (position_stack).
DMG_counts <- meth_filter_genes_2kbslop %>%
filter(contig_gene %in% DMGs.genes$contig_gene &
start_gene %in% DMGs.genes$start_gene &
end_gene %in% DMGs.genes$end_gene)
# Look at coverage for each DMG locus, by population
# mean % methylation across samples
DMG_genes_unique <- unique(DMG_counts$gene)
for (i in 1:length(DMG_genes_unique)) {
temp <- DMG_counts %>%
filter(chr == "Contig22489") %>%
group_by(population, chr, start) %>%
summarise(allCs_percent = 100*(sum(numCs)/sum(coverage)),
mean_percentMeth = mean(percMeth))
print(ggplotly(ggplot(temp, aes(x = population, y = mean_percentMeth, fill = population)) +
geom_bar(stat="identity") +
facet_wrap(~start) +
theme_light() + ggtitle(paste("gene = ", "Contig22489", sep="")) +
scale_fill_manual(values=c("firebrick3","dodgerblue3"))))
}
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (position_stack).
intersectBed \
-wb \
-a "../analyses/DMGs.bed" \
-b "../analyses/dml25.bed" \
> "../analyses/DMGs-with-DMLs.tab"
wc -l "../analyses/DMGs-with-DMLs.tab"
## 48 ../analyses/DMGs-with-DMLs.tab
dml25 <- read_delim(file = here::here("analyses", "dml25.bed"), delim = "\t")
## Parsed with column specification:
## cols(
## Contig100994 = col_character(),
## `3427` = col_double(),
## `3429` = col_double(),
## `-31` = col_double()
## )
DMLs.in.DMGs <-
read_delim(file = here::here("analyses", "DMGs-with-DMLs.tab"), delim = "\t", col_names = c(colnames(DMGs.bed), colnames(dml25))) #%>%
## Parsed with column specification:
## cols(
## contig_gene = col_character(),
## start_gene = col_double(),
## end_gene = col_double(),
## Contig100994 = col_character(),
## `3427` = col_double(),
## `3429` = col_double(),
## `-31` = col_double()
## )
#mutate(gene=paste(contig_gene, start_gene, sep="_"))
write.csv(DMLs.in.DMGs, file = "../analyses/DMLS.in.DMGS.csv")
# Barplots of all DMLs also located in DMGs
DMLs.in.DMGs.calcs <- meth_filter_reshaped %>%
filter(chr %in% DMLs.in.DMGs$contig_gene &
start %in% DMLs.in.DMGs$start_gene+1 &
end %in% DMLs.in.DMGs$end_gene-1) %>%
group_by(population, chr, start) %>%
dplyr::summarise(
mean_percMeth = mean(percMeth, na.rm=TRUE),
sd_percMeth=sd(percMeth, na.rm=TRUE),
n())
DMLs.in.DMGs.calcs %>% ungroup() %>% select(chr, start) %>% distinct()
## # A tibble: 744 x 2
## chr start
## <chr> <int>
## 1 Contig128000 3308
## 2 Contig128000 3370
## 3 Contig128000 3425
## 4 Contig128000 3442
## 5 Contig128000 3509
## 6 Contig128000 5085
## 7 Contig128000 5228
## 8 Contig128000 5237
## 9 Contig128000 5285
## 10 Contig128000 5297
## # … with 734 more rows
DMLs_in_DMGs_plots <- list()
for (i in 1:nrow(DMLs.in.DMGs)) {
test <- DMLs.in.DMGs.calcs %>%
filter(chr==DMLs.in.DMGs$contig_gene[i] &
start==DMLs.in.DMGs$start_gene[i]+1)
DMLs_in_DMGs_plots[[i]] <- ggplot(test, aes(x = population, y = mean_percMeth, fill = population,
label=paste0(round(mean_percMeth, digits = 2), "%"))) +
geom_bar(stat="identity", width = 0.5) + ylim(0,110) +
geom_pointrange(aes(ymin=mean_percMeth,
ymax=mean_percMeth+sd_percMeth, width=0.15)) +
geom_text(size=3, vjust=-0.5, hjust=1.25) +
theme_light() + ggtitle(paste("Contig = ", DMLs.in.DMGs[i, "contig_gene"], ", Locus = ",
DMLs.in.DMGs[i+1, "start_gene"], sep="")) +
scale_fill_manual(values=c("firebrick3","dodgerblue3"))
}
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width
DMLs_in_DMGs_plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
## Warning: Removed 1 rows containing missing values (geom_pointrange).
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
## Warning: Removed 1 rows containing missing values (geom_pointrange).
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]